

********************************************************************************
* Replication Do-file
* Title       : Long-Term Health Impacts of the Eritrean-Ethiopian War 
*               on Young Ethiopian Adults
* Authors     : Dainn Wie, Demeke Yemareshet
* Affiliation : Tokyo, Japan
* Contact     : wie-dainn@grips.ac.jp, hailuyemar@gmail.com
* Dataset     : Ethiopian Demographic and Health Survey (EDHS) 2016
* Description : DHS Data Cleaning and Variable Preparation
********************************************************************************

*--------------------------------------*
* Set Environment and File Paths    *
*--------------------------------------*

* Session setup: turn off output paging and close any log left open.
set more off
cap log close

* Locate the replication-package root. All paths below ("Data\...") are
* relative to the working directory. Each known author location is tried in
* turn; on any other machine neither exists, so the current working directory
* is kept and the do-file must be launched from the package root.

* Yemar's machine
capture cd "C:\Users\hailu\Dropbox\Yema\Revision.R1.WDP\Replication package"

* Dainn's machine (tried only if the directory above was not found)
if _rc {
    capture cd "C:\Users\wie-dainn\Dropbox\Work\Yema\Revision.R1.WDP\Replication package"
}

* Neither author directory exists: report the directory that will be used.
if _rc {
    display as text "Author directories not found; keeping the current working directory:"
    pwd
}



********************************************************************************
* Load and Clean DHS 2016 Person Recode (PR) Data
********************************************************************************

use "Data\DHS 2016\PR\PR", clear

* Rename variables for clarity

* --- Household-level identifiers and characteristics (hv0xx / hv2xx) ---
rename hhid case_identification
rename hvidx line_number
rename hv000 country_code

* Cluster identifier. Later steps (the keep lists and every merge) use the
* name cluster_number, but this section never created it from the DHS name
* hv001. Rename it here unless the input file already carries the new name.
capture confirm variable cluster_number
if _rc {
    rename hv001 cluster_number
}

rename hv002 household_number
rename hv003 resp_line_number
rename hv004 ultimate_unit
rename hv005 sample_weight
rename hv006 month_interview 
rename hv007 year_interview 
rename hv008 date_interview 
rename hv009 number_hhmembers
rename hv016 day_interview
rename hv024 region
rename hv044 hhselected_domviolence
rename hv218 line_household_head
rename hv219 sex_hh
rename hv220 age_hh
rename hv270 wealth_index 
rename hv271 wealthindex_factor 
rename hv270a wealthindex_urban 
rename hv271a wealthindexfa_urb

* --- Household-member characteristics (hv1xx) ---
rename hv104 sex_members
rename hv105 age_members
rename hv106 highest_educ
rename hv108 highest_educ_years
rename hv111 mother_alive
rename hv112 mothers_line
rename hv113 father_alive
rename hv114 fathers_line
rename hv115 marital_status
rename idxh4 household_index

* --- Women's measurements (ha*) ---
* These names are the targets of the women/men harmonization further below.
rename ha1 age_woman
rename ha32 cmc 
rename ha2 weight
rename ha3 height
rename ha4 height_age
rename ha5 height_agesd
rename ha11 weight_heightsd 
rename ha12b weight_heightpercen  
rename ha13 result_measurement 
rename ha35 smoking
rename ha40 bmi 
rename ha41 rohrer_index 
rename ha53 homoglobin 

* --- Men's measurements (hb*), parallel to the women's block above ---
rename hb1 man_age 
rename hb32 cmc_man
rename hb2 weightman
rename hb3 heightman
rename hb4 height_manage
rename hb5 height_magesd
rename hb11 weight_mheightsd 
rename hb12b weight_mheightpercen  
rename hb13 result_mameasurement 
rename hb35 smoking_man
rename hb40 bmi_man
rename hb41 rohrer_indexman
rename hb53 homoglobin_man



********************************************************************************
* Generate Education and Wealth Dummies
********************************************************************************

*-----------------------------*
* Education Dummies
*-----------------------------*


* One indicator per distinct value of highest_educ_years.
* tabulate numbers the indicators by the sorted POSITION of each value
* (educ_dummy1 = smallest value, educ_dummy2 = next, ...), not by the value.
* NOTE(review): the grouping below assumes exactly 23 distinct values, with
* position 1 presumably "0 years" and position 23 presumably a special
* "don't know" code - confirm with `tabulate highest_educ_years`.

tabulate highest_educ_years , gen(educ_dummy)

* Secondary or higher: tabulate positions 10 through 22.
* rowtotal() counts a missing indicator as zero, so observations with missing
* highest_educ_years receive 0 here rather than missing.

egen educ_dummy_sec_hig = rowtotal(educ_dummy10-educ_dummy22)
label variable educ_dummy_sec_hig "Individual's completed education: Secondary level or Higher level"

* No education: tabulate positions 1 and 23.

egen educ_dummy_noeduc = rowtotal(educ_dummy1 educ_dummy23)
label variable educ_dummy_noeduc "Individual's education: no education"

* Positions 2-9 become the single-grade indicators educ_dummyg1-educ_dummyg8.

forvalues pos = 2/9 {
    local grade = `pos' - 1
    rename educ_dummy`pos' educ_dummyg`grade'
}

* The raw indicators folded into the two grouped variables are dropped.

drop educ_dummy1 educ_dummy10-educ_dummy22 educ_dummy23



*-----------------------------*
* Wealth Index Dummies
*-----------------------------*

* Expand the categorical wealth_index into one indicator per category
* (wealth_dummy1 ... wealth_dummy5, numbered by sorted category position).

tabulate wealth_index, gen(wealth_dummy)

* Collapse the five indicators into three groups:
*   poor   = categories 1 and 2
*   rich   = categories 4 and 5
*   middle = category 3, kept as its own indicator under a new name

egen wealth_dummy_poor = rowtotal(wealth_dummy1 wealth_dummy2)
egen wealth_dummy_rich = rowtotal(wealth_dummy4 wealth_dummy5)
rename wealth_dummy3 wealth_dummy_middle

label variable wealth_dummy_poor "Wealth index: poorest or poorer"
label variable wealth_dummy_rich "Wealth index: richer or richest"
label variable wealth_dummy_middle "Wealth index: middle"

* The four raw indicators absorbed into the grouped variables are removed.

foreach q in 1 2 4 5 {
    drop wealth_dummy`q'
}


********************************************************************************
* Variable Recoding and Labeling
********************************************************************************

*-----------------------------*
* Household & Demographic Info
*-----------------------------*


* Recode the two sex variables from 1/2 to 1/0 and attach descriptive labels.
* Each variable is labelled exactly once: the original block labelled age_hh,
* age_members and wealthindex_urban twice (the first label was overwritten
* immediately) and repeated the hhselected_domviolence statement.

* --- Household head ---
recode sex_hh (2=0)
label var sex_hh " 1=male 0=female"
label var age_hh "age of household head (years)" 
label var line_household_head "line number of head of household"

* --- Household members ---
recode sex_members (2=0)
label var sex_members " 1=male 0=female"
label var age_members "age of household members (years)" 
label var marital_status "0=never mar 1=married 2=living tog 3=widowed 4=divorced"

* --- Household-level characteristics ---
label var region "1= tigray 2=affar 3=amhara 4=oromiya 5=somali 6=ben-gumz 7=snnp 8=gambela 9=harari 10=addis 11=dire dawa"
label var wealthindex_urban "1=poorest 2=poorer 3=middle 4=richer 5=richest"
label var wealthindexfa_urb "Wealth Index Factor Urban"

* The previous text "0=househol 1=househol" was a truncated copy of the value
* labels and did not distinguish the two codes.
label var hhselected_domviolence "0=household not selected 1=household selected"


********************************************************************************
* Keep Only Relevant Variables for Analysis
********************************************************************************


* Retain the renamed identifiers, household and member characteristics, the
* women's and men's measurement variables, and the grouped wealth and
* education indicators. All other variables are discarded.
keep case_identification line_number country_code cluster_number          ///
     household_number resp_line_number ultimate_unit sample_weight        ///
     month_interview year_interview date_interview day_interview          ///
     number_hhmembers region hhselected_domviolence line_household_head   ///
     sex_hh age_hh wealth_index wealthindex_factor wealthindex_urban      ///
     wealthindexfa_urb sex_members age_members highest_educ               ///
     mother_alive mothers_line father_alive fathers_line marital_status   ///
     household_index                                                      ///
     age_woman cmc weight height height_age height_agesd weight_heightsd  ///
     weight_heightpercen result_measurement smoking bmi rohrer_index      ///
     homoglobin                                                           ///
     man_age cmc_man weightman heightman height_manage height_magesd      ///
     weight_mheightsd weight_mheightpercen result_mameasurement           ///
     smoking_man bmi_man rohrer_indexman homoglobin_man                   ///
     wealth_dummy_poor wealth_dummy_middle wealth_dummy_rich              ///
     educ_dummy_noeduc educ_dummyg1 educ_dummyg2 educ_dummyg3             ///
     educ_dummyg4 educ_dummyg5 educ_dummyg6 educ_dummyg7 educ_dummyg8     ///
     educ_dummy_sec_hig


********************************************************************************
* Merge DHS with Geographic Data (GPS) and Harmonize Anthropometrics
********************************************************************************


*-----------------------------*
* Merge: Geographic Data (GPS)
*-----------------------------*

* Attach cluster-level geographic data: many person records per cluster,
* one row per cluster in the GE file.
merge m:1 cluster_number using "Data\DHS Geographic\GE"


*-----------------------------*
* Harmonize Anthropometric Data (Women + Men)
*-----------------------------*

* Each women's variable is filled from its men's counterpart wherever the
* women's value is missing. The two lists are positionally paired: the k-th
* name in `men_vars' supplies values for the k-th name in `women_vars'.

local women_vars cmc weight height height_age height_agesd weight_heightsd ///
                 weight_heightpercen result_measurement smoking bmi        ///
                 rohrer_index homoglobin
local men_vars   cmc_man weightman heightman height_manage height_magesd   ///
                 weight_mheightsd weight_mheightpercen                     ///
                 result_mameasurement smoking_man bmi_man rohrer_indexman  ///
                 homoglobin_man

local k = 0
foreach target of local women_vars {
    local ++k
    local donor : word `k' of `men_vars'
    replace `target' = `donor' if missing(`target')
}



*-----------------------------*
* Drop Unneeded Variables and Keep Final Set
*-----------------------------*

* Final variable set after the GPS merge: the harmonized measurement
* variables (the men's-only columns are no longer needed), the geographic
* fields from the GE file, the merge indicator, and the grouped indicators.
keep case_identification line_number country_code cluster_number          ///
     household_number resp_line_number ultimate_unit sample_weight        ///
     month_interview year_interview date_interview day_interview          ///
     number_hhmembers region hhselected_domviolence line_household_head   ///
     sex_hh age_hh wealth_index wealthindex_factor wealthindex_urban      ///
     wealthindexfa_urb sex_members age_members highest_educ               ///
     mother_alive mothers_line father_alive fathers_line marital_status   ///
     household_index                                                      ///
     cmc weight height height_age height_agesd weight_heightsd            ///
     weight_heightpercen result_measurement smoking bmi rohrer_index      ///
     homoglobin                                                           ///
     wealth_dummy_poor wealth_dummy_middle wealth_dummy_rich              ///
     DHSID DHSCC DHSYEAR CCFIPS ADM1FIPS ADM1FIPSNA ADM1SALBNA ADM1SALBCO ///
     ADM1DHS ADM1NAME DHSREGCO DHSREGNA SOURCE URBAN_RURA LATNUM LONGNUM  ///
     ALT_GPS ALT_DEM DATUM _merge                                         ///
     educ_dummy_noeduc educ_dummyg1 educ_dummyg2 educ_dummyg3             ///
     educ_dummyg4 educ_dummyg5 educ_dummyg6 educ_dummyg7 educ_dummyg8     ///
     educ_dummy_sec_hig


*-----------------------------*
* Save Merged & Cleaned Dataset
*-----------------------------*

save "Data\DHS 2016\PR\merge2", replace


********************************************************************************
* Identify Non-Migrants: Merge PR with KR (Women) and MR (Men) Datasets
********************************************************************************


*-----------------------------*
* Step 1: Merge with Women's Info (KR)
*-----------------------------*

* Load KR file (children's recode, includes mother's info)

use "Data\DHS 2016\KR\KR", clear 

* Reduce to one record per caseid. The stable sort keeps the file's original
* row order within each caseid, so the record retained by `duplicates drop`
* is the same on every run.

sort caseid, stable
duplicates report caseid
duplicates drop caseid, force

* Rename to match PR identifiers.
* The merge below needs cluster_number, household_number and line_number.
* The original code assumed the KR file already carried the names
* cluster_number, household_number and resp_line_number. The MR step of this
* do-file renames from the raw DHS names (mv001-mv003) instead, so both
* layouts are accepted here: an already-renamed variable is used as is,
* otherwise the raw v001-v003 variable is renamed.
* NOTE(review): confirm which layout the distributed KR file actually has.

capture confirm variable cluster_number
if _rc {
    rename v001 cluster_number
}

capture confirm variable household_number
if _rc {
    rename v002 household_number
}

capture confirm variable resp_line_number
if _rc {
    rename v003 line_number
}
else {
    rename resp_line_number line_number
}

* Keep only migration-related variables
keep caseid midx v000 cluster_number household_number line_number v104 v105 v105a


* Save reduced KR file

save "Data\DHS 2016\KR\new", replace

* Load previously merged PR file

use "Data\DHS 2016\PR\merge2", clear

* The indicator left by the GPS merge must go before a new merge can run.
drop _merge

* Merge PR with KR: one person record per (cluster, household, line number)

merge 1:1 cluster_number household_number line_number using "Data\DHS 2016\KR\new"

* Save intermediate output

save "Data\DHS 2016\PR\merge2", replace



*-----------------------------*
* Step 2: Merge with Men's Info (MR)
*-----------------------------*


* Open the men's recode and reduce it to one record per mcaseid.
use "Data\DHS 2016\MR\MR", clear

sort mcaseid
duplicates report mcaseid
duplicates drop mcaseid, force

* Give the three identifying variables the same names the PR file uses,
* so they can serve as merge keys.

rename (mv001 mv002 mv003) (cluster_number household_number line_number)

* Only the keys and the three migration variables are carried forward.

keep mcaseid cluster_number household_number line_number mv104 mv105 mv105a

save "Data\DHS 2016\MR\new.DTA", replace

* Back to the PR+KR file; the indicator from the KR merge is discarded so
* that a new merge can be run.

use "Data\DHS 2016\PR\merge2", clear
drop _merge

* Person-level match on (cluster, household, line number).

merge 1:1 cluster_number household_number line_number using "Data\DHS 2016\MR\new.DTA"


*-----------------------------*
* Step 3: Harmonize Migration Variables
*-----------------------------*


* For each migration item, take the value from the men's file (mv*) wherever
* the value from the KR file (v*) is missing.

foreach item in 104 105 105a {
    replace v`item' = mv`item' if missing(v`item')
}

* Give the harmonized variables descriptive names and labels.

rename (v104 v105 v105a) (years_lived type_previous reg_previous)

label var years_lived   "years lived in place of residence"
label var type_previous "type of place of previous residence"
label var reg_previous  "region of previous residence"


*-----------------------------*
* Step 4: Final Keep and Save
*-----------------------------*


* Same variable set as the post-GPS keep, plus the three harmonized
* migration variables. The source-specific v*/mv* columns and the KR/MR case
* identifiers are discarded.
keep case_identification line_number country_code cluster_number          ///
     household_number resp_line_number ultimate_unit sample_weight        ///
     month_interview year_interview date_interview day_interview          ///
     number_hhmembers region hhselected_domviolence line_household_head   ///
     sex_hh age_hh wealth_index wealthindex_factor wealthindex_urban      ///
     wealthindexfa_urb sex_members age_members highest_educ               ///
     mother_alive mothers_line father_alive fathers_line marital_status   ///
     household_index                                                      ///
     cmc weight height height_age height_agesd weight_heightsd            ///
     weight_heightpercen result_measurement smoking bmi rohrer_index      ///
     homoglobin                                                           ///
     wealth_dummy_poor wealth_dummy_middle wealth_dummy_rich              ///
     DHSID DHSCC DHSYEAR CCFIPS ADM1FIPS ADM1FIPSNA ADM1SALBNA ADM1SALBCO ///
     ADM1DHS ADM1NAME DHSREGCO DHSREGNA SOURCE URBAN_RURA LATNUM LONGNUM  ///
     ALT_GPS ALT_DEM DATUM _merge                                         ///
     years_lived type_previous reg_previous                               ///
     educ_dummy_noeduc educ_dummyg1 educ_dummyg2 educ_dummyg3             ///
     educ_dummyg4 educ_dummyg5 educ_dummyg6 educ_dummyg7 educ_dummyg8     ///
     educ_dummy_sec_hig



* Save final merged dataset with migration info

save "Data\DHS 2016\PR\merge2", replace




********************************************************************************
* Convert Year of Birth from Ethiopian to Gregorian Calendar
********************************************************************************

*-----------------------------*
* Step 1: Estimate Year and Month of Birth
*-----------------------------*


* Decompose the century-month code of birth (cmc) into a year and a month:
* months are counted from 1900, twelve per year, with month 1 = first month.
gen year_birth  = 1900 + int((cmc - 1)/12)
gen month_birth = cmc - 12*(year_birth - 1900)

* Observations without a birth year (i.e. without cmc) cannot be dated.
keep if !missing(year_birth)


*-----------------------------*
* Step 2: Convert to Gregorian Calendar
*-----------------------------*

* cmc carries no day of month, so day 1 is supplied to the converter.
gen e_day = 1
eth2grecal, et_year(year_birth) et_month(month_birth) et_day(e_day)

* NOTE(review): the sample-cleaning and exposure sections of this do-file
* read g_year and g_month, so the conversion is expected to create variables
* with those names. Confirm against the eth2grecal help file.


*-----------------------------*
* Step 3: Save Dataset
*-----------------------------*


save "Data\DHS 2016\PR\merge2", replace



********************************************************************************
* ACLED Conflict Data: Clean and Extract Districts Affected by Conflict
********************************************************************************


*-----------------------------*
* Step 1: Import ACLED Dataset
*-----------------------------*


* Read the ACLED workbook; the first row supplies the variable names.
import excel "Data\ACLED\Africa_1997-2023_Jun16.xlsx", sheet("Sheet1") firstrow clear

* Keep only Ethiopia

keep if COUNTRY == "Ethiopia"


*-----------------------------*
* Step 2: Rename and Label Key Variables
*-----------------------------*

* One label per variable. The original first gave region a numeric code list
* and overwrote it on the next line, so only the final label is kept.

rename ADMIN1 region
label var region "Administrative Region"
label var ADMIN2 "Administrative Zone"
label var ADMIN3 "Adminstrative District"


*-----------------------------*
* Step 3: Keep Only Conflict Years: 1998–2000
*-----------------------------*


keep if YEAR == 1998 | YEAR == 1999 | YEAR == 2000


*-----------------------------*
* Step 4: Keep Specific Conflict towns
*-----------------------------*

keep if ADMIN3 == "Zala Anbesa town" | ADMIN3 == "Elidar" | ADMIN3 == "Erob" | ADMIN3 == "Adigrat town" | ADMIN3 == "Saharti"

* Keep the first event per district, ordered by year.
* A district can have several events in the same year. Without the stable
* option Stata orders such ties arbitrarily, so the retained event, and with
* it the LATITUDE/LONGITUDE used for the distance calculation, could change
* from run to run. A stable sort breaks ties by the workbook's row order.

sort ADMIN3 YEAR, stable
by ADMIN3 : keep if _n == 1


*-----------------------------*
* Step 5: Reshape to Wide Format
*-----------------------------*

* id numbers the retained sites 1, 2, ... in sorted order of LOCATION. The
* same numbering becomes the suffix of district#, LATITUDE# and LONGITUDE#.
egen id=group(LOCATION)
keep ADMIN3 id LATITUDE LONGITUDE
rename ADMIN3 district

* n is a constant key: the reshape collapses all sites into a single row,
* and the DHS data are later merged onto that row through n.
gen n=1

reshape wide district LATITUDE LONGITUDE, i(n) j(id)


*-----------------------------*
* Step 6: Save Cleaned Conflict Location File
*-----------------------------*

save "Data\ACLED\edit", replace



********************************************************************************
* Merge DHS Data with ACLED Conflict Data & Calculate Distance to Conflict Sites
********************************************************************************


* Reopen the person-level DHS file.

use "Data\DHS 2016\PR\merge2", clear

* The indicator left by the previous merge has to go before merging again.
drop _merge

* The conflict file is a single wide row keyed by n == 1. Giving every person
* record the same key value attaches that row (all site names and
* coordinates) to every observation.
gen n = 1

merge m:1 n using "Data\ACLED\edit"

* The key has served its purpose.
drop n

* Store the combined file.

save "Data\DHS 2016\PR\merge2", replace


*-----------------------------*
* Calculate Distances to Each Conflict Location
*-----------------------------*


* Geodetic distance from each DHS cluster (LATNUM, LONGNUM) to each of the
* five conflict sites (LATITUDE#, LONGITUDE#), one new variable per site.
* geodist reports kilometers by default.

forvalues site = 1/5 {
    geodist LATNUM LONGNUM LATITUDE`site' LONGITUDE`site', gen(distance`site')
}


*-----------------------------*
* Compute Minimum Distance to Any Conflict Site
*-----------------------------*

* Row-wise minimum over the five site distances.
egen min_distance = rowmin(distance1 distance2 distance3 distance4 distance5)


*-----------------------------*
* Add Fatality Counts (Optional / from ACLED summary)
*-----------------------------*

* Hard-coded fatality count for each of the five sites, stored as constants
* FATALITIES1 ... FATALITIES5. The k-th number below goes to FATALITIESk.
local site_deaths 1851 3692 2451 1846 6479

forvalues site = 1/5 {
    local deaths : word `site' of `site_deaths'
    gen FATALITIES`site' = `deaths'
}


* Store the dataset, now including the distance and fatality variables.

save "Data\DHS 2016\PR\merge2", replace



********************************************************************************
* Sample Cleaning and Variable Construction
********************************************************************************


use "Data\DHS 2016\PR\merge2", clear


*-----------------------------*
* Filter: Keep individuals born in 1990 or later (exclude earlier cohorts)
*-----------------------------*

drop if g_year < 1990



*-----------------------------*
* Clean Height, Weight, BMI, and Age Variables (drop invalid/outlier values)
*-----------------------------*

* For each variable in turn: show its distribution, then drop observations
* whose value is missing or above the variable's ceiling. The ceilings are
* paired with the variables by position:
*   height > 9000, weight > 1400, bmi > 5000, age_members > 26

local ceilings 9000 1400 5000 26

local k = 0
foreach v of varlist height weight bmi age_members {
    local ++k
    local top : word `k' of `ceilings'

    summarize `v', detail
    drop if missing(`v') | `v' > `top'
}


*-----------------------------*
* Convert to More Intuitive Units
*-----------------------------*


* Rescale the stored measurements: height and weight are divided by 10,
* bmi by 100.
gen height_cm = height/10
gen weight_kg = weight/10
gen bmi_      = bmi/100

label var height_cm "individual height in cm"
label var weight_kg "individual weight in kg"
label var bmi_      "individual body mass index"


*-----------------------------*
* Create Urban/Rural Binary Variable
*-----------------------------*

* 1 when the GE file's URBAN_RURA code is "U", 0 for every other value.
gen location_urban = cond(URBAN_RURA == "U", 1, 0)
label var location_urban "0= Rural 1= Urban"



********************************************************************************
* Create Distance Categories, In-Utero Exposure, and Interaction Variables
********************************************************************************

*-----------------------------*
* Distance category dummies (km bands)
*-----------------------------*



* Indicator for each distance band around the nearest conflict site
* (min_distance, in km). Bands are half-open: the lower bound is excluded and
* the upper bound included. A missing min_distance compares greater than any
* number in Stata, so such observations get 0 in every band.

* 50 km bands out to 200 km
gen distance_50km = (min_distance <= 50)
gen distance_100km = (min_distance > 50 & min_distance <= 100)
gen distance_150km = (min_distance > 100 & min_distance <= 150)
gen distance_200km = (min_distance > 150 & min_distance <= 200)

* 100 km bands out to 300 km
gen distance_1 = (min_distance <= 100)
gen distance_2 = (min_distance > 100 & min_distance <= 200)
gen distance_3 = (min_distance > 200 & min_distance <= 300)


*-----------------------------*
* Label distance variables
*-----------------------------*


label variable distance_50km "with in 50km (1) or >= 50km (0)"
label variable distance_100km "with in 50-100km (1) or not (0)"
label variable distance_150km "with in 100-150km (1) or not (0)"
* Fix: the 150-200km label was previously applied to distance_100km, which
* overwrote that variable's label and left distance_200km unlabelled.
label variable distance_200km "with in 150-200km (1) or not (0)"

label variable distance_1 "with in 100km (1) or >= 100km (0)"
label variable distance_2 "with in 100-200km (1) or not (0)"
label variable distance_3 "with in 200-300km (1) or not (0)"
	

*-----------------------------*
* Generate in utero exposure variable
* Assuming g_year and g_month are Gregorian birth year and month
* Birth in or after June 1998 considered exposed in utero
*-----------------------------*


* Exposure indicator: 1 for births in June 1998 or any later month, else 0.
gen uteroexp = g_year > 1998 | (g_year == 1998 & g_month >= 6)
label variable uteroexp "In Utero (1) or After Birth (0)"


*-----------------------------*
* Interaction terms: Distance * In utero exposure
*-----------------------------*

* 50 km bands: product of each band indicator with the exposure indicator.
foreach band in 50km 100km 150km 200km {
    gen interaction`band' = distance_`band'*uteroexp
}

* 100 km bands: the first is named `interaction` (no suffix), the rest are
* interaction2 and interaction3.
gen interaction = distance_1*uteroexp
forvalues ring = 2/3 {
    gen interaction`ring' = distance_`ring'*uteroexp
}


*-----------------------------*
* Label interaction variables
*-----------------------------*

label variable interaction50km  "Exposure within 50km in utero"
label variable interaction100km "Exposure within 50km-100km in utero"
label variable interaction150km "Exposure within 100km-150km in utero"
label variable interaction200km "Exposure within 150km-200km in utero"

label variable interaction  "Exposure within 100km in utero"
label variable interaction2 "Exposure within 100km-200km in utero"
label variable interaction3 "Exposure within 200km-300km in utero"


*-----------------------------*
* Rename variables for clarity
*-----------------------------*


* Final names for the analysis file: cluster identifier -> area,
* Gregorian birth year -> year.
rename (cluster_number g_year) (area year)



*-----------------------------*
* Save final dataset for analysis
*-----------------------------*

* Written to the working directory (package root), not the Data folder.
save "merge3", replace




